In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# LOGISTIC REGRESSION IN PYTHON
# Import necessary libraries
! pip install pandas
! pip install numpy
! pip install scikit-learn
! pip install statsmodels
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
import numpy as np
In [8]:
# Load data from the web
url = "http://fs2.american.edu/~baron/627/R/depression_data.csv"
Depr = pd.read_csv(url)
# Remove rows with missing values
Depr1 = Depr.dropna()
In [15]:
# Convert categorical columns
Depr1.loc[:, 'Gender'] = Depr1['Gender'].astype('category').cat.codes
Depr1.loc[:, 'Guardian_status'] = Depr1['Guardian_status'].astype('category').cat.codes
In [16]:
# Define predictors (with constant for intercept) and outcome
X = sm.add_constant(Depr1[['Gender', 'Guardian_status', 'Cohesion_score']])
y = Depr1['Diagnosis']
In [ ]:
In [13]:
print(y.dtype)
print(X.dtypes)
float64 const float64 Gender object Guardian_status int64 Cohesion_score float64 dtype: object
In [19]:
y = y.astype(int)
X = X.apply(pd.to_numeric, errors='coerce') # converts all columns to numeric
In [20]:
# Fit the logistic regression model
logit_model = sm.Logit(y, X).fit()
print(logit_model.summary())
Optimization terminated successfully. Current function value: 0.393423 Iterations 7 Logit Regression Results ============================================================================== Dep. Variable: Diagnosis No. Observations: 458 Model: Logit Df Residuals: 454 Method: MLE Df Model: 3 Date: Mon, 23 Sep 2024 Pseudo R-squ.: 0.09559 Time: 11:45:40 Log-Likelihood: -180.19 converged: True LL-Null: -199.23 Covariance Type: nonrobust LLR p-value: 2.705e-08 =================================================================================== coef std err z P>|z| [0.025 0.975] ----------------------------------------------------------------------------------- const 1.0083 0.505 1.998 0.046 0.019 1.998 Gender -0.6874 0.288 -2.383 0.017 -1.253 -0.122 Guardian_status -0.7484 0.286 -2.616 0.009 -1.309 -0.188 Cohesion_score -0.0436 0.010 -4.167 0.000 -0.064 -0.023 ===================================================================================
In [21]:
# Predict probabilities
Prob = logit_model.predict(X)
# Classify based on threshold 0.3
YesPredict = (Prob > 0.3).astype(int)
# Confusion matrix
conf_matrix = confusion_matrix(y, YesPredict)
print(conf_matrix)
[[359 27] [ 48 24]]
In [22]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
# Fit logistic regression on training data
logit_model_train = sm.Logit(y_train, X_train).fit()
# Predict on test data
y_pred_test_prob = logit_model_train.predict(X_test)
y_pred_test = (y_pred_test_prob > 0.3).astype(int)
# Confusion matrix for test set
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print(conf_matrix_test)
Optimization terminated successfully. Current function value: 0.395923 Iterations 7 [[168 26] [ 24 11]]
In [23]:
! pip install matplotlib
import matplotlib.pyplot as plt
Requirement already satisfied: matplotlib in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (3.9.2) Requirement already satisfied: contourpy>=1.0.1 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (1.3.0) Requirement already satisfied: cycler>=0.10 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (4.54.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (1.4.7) Requirement already satisfied: numpy>=1.23 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (2.1.1) Requirement already satisfied: packaging>=20.0 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (24.1) Requirement already satisfied: pillow>=8 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (10.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (3.1.4) Requirement already satisfied: python-dateutil>=2.7 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\baron\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
In [24]:
# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_test_prob)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
In [25]:
# Predict for a specific case (e.g., Female, lives with both parents, Cohesion_score = 26)
new_data = pd.DataFrame({'const': 1, 'Gender': [0], 'Guardian_status': [1], 'Cohesion_score': [26]})
predicted_logit = logit_model_train.predict(new_data)
# Convert logit to probability
predicted_prob = np.exp(predicted_logit) / (1 + np.exp(predicted_logit))
print(f'Predicted Probability: {predicted_prob[0]:.2f}')
Predicted Probability: 0.59